# we import the required packages
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load the review dataset from cleaned_reviews_dataset.csv
df = pd.read_csv("cleaned_reviews_dataset.csv")

# Download NLTK's stop-word corpus and keep the English entries as a set.
# These words are removed from every review further down.
nltk.download("stopwords")
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

def clean_review(review):
    """
    Strip stop words out of a review.

    The review is split on whitespace, every token found in the
    module-level stop_words set is discarded, and the remaining tokens
    are returned joined by single spaces (an empty string if none remain).
    """
    kept = []
    for token in review.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Run every review through clean_review to strip its stop words
df["review"] = df["review"].apply(clean_review)
# Reviews made up only of stop words are now empty strings; drop those rows
df = df.loc[df["review"] != ""]

# Sanity check: this selection should come back empty
df.loc[df["review"] == ""]

df.head()

# Character length of each cleaned review
df["review_length"] = df["review"].apply(len)

# Vocabulary of difficulty-related terms. Entries that contain a space
# ("skill check", "touch grass") are multi-word phrases.
difficulty_words = {
   "casual", "relaxing", "laidback", "beginnerfriendly", "straightforward", "forgiving", "simple", "basic", "breezy",
   "gentle", "accessible", "chill", "balanced", "fair", "moderate", "reasonable", "gradual", "steady", "manageable",
   "evenpaced", "middleground", "standard", "tough", "tricky", "complex", "demanding", "unforgiving", "intense", "rage",
   "stressful", "hardcore", "thorough", "skillbased", "technical", "strategic", "brutal", "punishing",
   "merciless", "relentless", "grueling", "soulcrushing", "overwhelming", "formidable", "expertlevel", "masochistic",
   "impossible", "rageinducing", "frustrating", "challenging", "insane", "ridiculous", "extreme", "severe",
   "daunting", "backbreaking", "difficult", "hard", "harsh", "intensive", "rigorous", "rough",
   "savage", "strict", "taxing", "troublesome", "crazy", "deadly", "exhausting", "fierce",
   "hellish", "mindbending", "nightmarish", "painful", "ruthless", "tiresome", "tortuous", "unbearable", "vicious",
   "wicked", "crushing", "demonic", "diabolical", "maddening", "oppressive", "robust", "beastly", "bonkers", "hefty",
   "ez", "ezpz", "easymode", "hardmode", "tryhard",
   "sweaty", "noobfriendly", "rekt", "rekted", "rip",
   "megahard", "ultrahard", "gigahard", "omegahard", "kekw",
   "busted", "broken",
   "ggez", "2ez", "2easy", "normiefriendly", "casul", "filthycasual",
   "bigbrain", "galaxybrain", "smurfing",
   "skillcheck", "skill check", "skillissue", "skill", "issue", "gitgud", "git", "gud", "getgood", "progamer",
   "omegalul", "boomerfriendly", "zoomer", "scrubfriendly", "nolife", "touchgrass", "touch grass"
}

def difficulty_word_count(review):
    """
    Count the distinct difficulty terms that occur in a review.

    Every entry of difficulty_words contributes at most 1 to the result,
    however many times it appears. Single-word entries must equal one of
    the review's whitespace-separated tokens. Multi-word entries must
    appear as consecutive whole tokens; comparing such a phrase against
    individual tokens could never succeed, so it is matched against the
    space-normalised review text instead.

    Parameters:
        review (str): review text, tokenised with str.split().

    Returns:
        int: number of difficulty_words entries found in the review.
    """
    tokens = review.split()
    # Set membership keeps each single-word lookup O(1)
    token_set = set(tokens)
    # Single-space join plus padding so phrase matches align to token edges
    padded_text = f" {' '.join(tokens)} "

    count = 0
    for term in difficulty_words:
        parts = term.split()
        if len(parts) > 1:
            # Multi-word phrase: needs consecutive whole-token occurrence
            if f" {' '.join(parts)} " in padded_text:
                count += 1
        elif term in token_set:
            count += 1
    return count

# Derive the per-review count of difficulty terms
df["difficulty_word_count"] = df["review"].apply(difficulty_word_count)

df.head()

# Binary flag: 1 when the review contains at least one difficulty term, else 0
df["mentions_difficulty"] = (df["difficulty_word_count"] > 0).astype("int64")
df.head()

# Store the voted_up flag as a 64-bit integer column
df["voted_up"] = df["voted_up"].astype("int64")

df.dtypes

game_name                         object
review                            object
voted_up                           int64
timestamp_created                  int64
author_num_games_owned             int64
author_num_reviews                 int64
author_playtime_at_review          int64
author_playtime_last_two_weeks     int64
author_playtime_forever            int64
review_length                      int64
difficulty_word_count              int64
mentions_difficulty                int64
dtype: object

# Convert the timestamp column (integers interpreted as seconds) to datetimes
df["timestamp_created"] = pd.to_datetime(
    df["timestamp_created"],
    unit="s",
)

df.dtypes

game_name                                 object
review                                    object
voted_up                                   int64
timestamp_created                 datetime64[ns]
author_num_games_owned                     int64
author_num_reviews                         int64
author_playtime_at_review                  int64
author_playtime_last_two_weeks             int64
author_playtime_forever                    int64
review_length                              int64
difficulty_word_count                      int64
mentions_difficulty                        int64
dtype: object

# Map each game to its list of lower-cased genre tags. The tags are written
# as one space-separated string per game and normalised in a single place.
genre_mapping = {
    title: tag_string.lower().split()
    for title, tag_string in {
        "elden_ring": "Soulslike Open_World RPG Third_Person",
        "sekiro": "Soulslike Action Adventure Third_Person",
        "dark_souls_remastered": "Soulslike Action RPG Third_Person",
        "armored_core_6": "Third_Person Soulslike Action",
        "hollow_knight": "Metroidvania Platformer Soulslike 2D",
        "hades": "Roguelike Action",
        "dead_cells": "Action Adventure Roguelike Metroidvania 2D",
        "slay_the_spire": "Roguelike Deckbuilding Turn_Based",
        "returnal": "Action Roguelike Co_op Third_Person Shooter",
        "risk_of_rain_2": "Action Roguelike Third_Person Co_op Shooter",
        "witcher_3": "Open_World RPG Adventure Fantasy",
        "mass_effect": "RPG Action Third_Person Shooter",
        "divinity_original_sin_2": "Turn_Based RPG Strategy CRPG Fantasy",
        "baldurs_gate_3": "Turn_Based RPG CRPG Fantasy",
        "pillars_of_eternity": "RPG CRPG Fantasy",
        "portal_2": "Platformer Puzzle First_Person",
        "the_witness": "Puzzle First_Person Open_World",
        "celeste": "Platformer 2D",
        "ori_and_the_blind_forest": "Platformer 2D Metroidvania",
        "inside": "Puzzle Platformer 2D",
        "stardew_valley": "2D Sandbox Crafting Simulation",
        "factorio": "2D Sandbox Crafting Simulation Base_Building Strategy Survival",
        "frostpunk": "Base_Building Strategy Survival Simulation",
        "the_forest": "Survival Open_World Crafting First_Person",
        "subnautica": "Survival Open_World Crafting First_Person",
        "cod_modern_warfare": "Action First_Person Shooter Multiplayer",
        "rocket_league": "Multiplayer Competitive",
        "counter_strike_2": "Multiplayer Competitive First_Person Shooter",
        "team_fortress_2": "First_Person Shooter Multiplayer",
        "dota_2": "Multiplayer Strategy Competitive",
    }.items()
}

# Print the resulting mapping, one game per line
for game in genre_mapping:
    print(f"{game} :      {genre_mapping[game]}")

elden_ring :      ['soulslike', 'open_world', 'rpg', 'third_person']
sekiro :      ['soulslike', 'action', 'adventure', 'third_person']
dark_souls_remastered :      ['soulslike', 'action', 'rpg', 'third_person']
armored_core_6 :      ['third_person', 'soulslike', 'action']
hollow_knight :      ['metroidvania', 'platformer', 'soulslike', '2d']
hades :      ['roguelike', 'action']
dead_cells :      ['action', 'adventure', 'roguelike', 'metroidvania', '2d']
slay_the_spire :      ['roguelike', 'deckbuilding', 'turn_based']
returnal :      ['action', 'roguelike', 'co_op', 'third_person', 'shooter']
risk_of_rain_2 :      ['action', 'roguelike', 'third_person', 'co_op', 'shooter']
witcher_3 :      ['open_world', 'rpg', 'adventure', 'fantasy']
mass_effect :      ['rpg', 'action', 'third_person', 'shooter']
divinity_original_sin_2 :      ['turn_based', 'rpg', 'strategy', 'crpg', 'fantasy']
baldurs_gate_3 :      ['turn_based', 'rpg', 'crpg', 'fantasy']
pillars_of_eternity :      ['rpg', 'crpg', 'fantasy']
portal_2 :      ['platformer', 'puzzle', 'first_person']
the_witness :      ['puzzle', 'first_person', 'open_world']
celeste :      ['platformer', '2d']
ori_and_the_blind_forest :      ['platformer', '2d', 'metroidvania']
inside :      ['puzzle', 'platformer', '2d']
stardew_valley :      ['2d', 'sandbox', 'crafting', 'simulation']
factorio :      ['2d', 'sandbox', 'crafting', 'simulation', 'base_building', 'strategy', 'survival']
frostpunk :      ['base_building', 'strategy', 'survival', 'simulation']
the_forest :      ['survival', 'open_world', 'crafting', 'first_person']
subnautica :      ['survival', 'open_world', 'crafting', 'first_person']
cod_modern_warfare :      ['action', 'first_person', 'shooter', 'multiplayer']
rocket_league :      ['multiplayer', 'competitive']
counter_strike_2 :      ['multiplayer', 'competitive', 'first_person', 'shooter']
team_fortress_2 :      ['first_person', 'shooter', 'multiplayer']
dota_2 :      ['multiplayer', 'strategy', 'competitive']

# Collect every distinct genre tag used by any game into one set
all_genres = {tag for tag_list in genre_mapping.values() for tag in tag_list}
all_genres

{'2d',
 'action',
 'adventure',
 'base_building',
 'co_op',
 'competitive',
 'crafting',
 'crpg',
 'deckbuilding',
 'fantasy',
 'first_person',
 'metroidvania',
 'multiplayer',
 'open_world',
 'platformer',
 'puzzle',
 'roguelike',
 'rpg',
 'sandbox',
 'shooter',
 'simulation',
 'soulslike',
 'strategy',
 'survival',
 'third_person',
 'turn_based'}

# One-hot encode the genres directly: one 0/1 column per genre, without
# first materialising a combined genre column.
# all_genres is a set of strings, whose iteration order can change between
# interpreter runs; iterating in sorted order keeps the column order of the
# frame (and of the CSV written later) reproducible.
for genre in sorted(all_genres):
    # Games whose tag list contains this genre
    games_with_genre = [
        game for game, genres in genre_mapping.items() if genre in genres
    ]
    # 1 for rows belonging to one of those games, 0 for every other row
    df[genre] = df['game_name'].isin(games_with_genre).astype("int64")

df.head()

# Per-game 0.25 / 0.5 / 0.75 quantiles of author_playtime_forever,
# reshaped to one row per game with one column per quantile
game_quartiles = (
    df.groupby('game_name')['author_playtime_forever']
    .quantile([0.25, 0.5, 0.75])
    .unstack()
)
game_quartiles.columns = ['lower_quartile', 'median', 'upper_quartile']

def classify_experience(row):
    """
    Assign an experience level to one review record.

    The record's total playtime is compared with the quartiles stored in
    game_quartiles for its game, together with the number of games the
    author owns:

    - 'beginner': playtime below the game's lower quartile
    - 'experienced': playtime above the upper quartile, or more than 30
      games owned, or more than 10 games owned with playtime at or above
      the median
    - 'intermediate': everything else
    """
    title = row['game_name']
    total_playtime = row['author_playtime_forever']
    library_size = row['author_num_games_owned']

    # Quartile thresholds for this record's game
    q1, q2, q3 = (
        game_quartiles.loc[title, label]
        for label in ('lower_quartile', 'median', 'upper_quartile')
    )

    # The beginner check comes first, so a large library cannot override it
    if total_playtime < q1:
        return 'beginner'

    heavy_player = total_playtime > q3 or library_size > 30
    seasoned_collector = library_size > 10 and total_playtime >= q2
    if heavy_player or seasoned_collector:
        return 'experienced'
    return 'intermediate'

# Label every row with classify_experience, then inspect the class balance
df['experience_level'] = df.apply(classify_experience, axis='columns')
df['experience_level'].value_counts()

experience_level
experienced     17696
intermediate    14355
beginner        10686
Name: count, dtype: int64

df.head()

# One-hot encode experience_level as int64 indicator columns; drop_first
# omits the first category's column, leaving it as the implicit baseline
df = pd.get_dummies(
    df,
    columns=['experience_level'],
    drop_first=True,
    dtype="int64",
)

df.head()

# Fetch the lexicon that the VADER sentiment analyzer relies on
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!

True

# NOTE(review): the review text scored here already had NLTK stop words
# removed earlier in this file. If that list contains negations such as
# "not" or "no", VADER's negation handling no longer sees them; confirm
# whether scoring should run on the unfiltered text instead.
sia = SentimentIntensityAnalyzer()

# Score each review with VADER and keep only the 'compound' value
df['sentiment_score'] = df['review'].apply(
    lambda text: sia.polarity_scores(text)['compound']
)

# Write the engineered feature table to disk without the index column
df.to_csv("featured_reviews.csv", index=False)

	game_name	review	voted_up	timestamp_created	author_num_games_owned	author_num_reviews	author_playtime_at_review	author_playtime_last_two_weeks	author_playtime_forever
0	elden_ring	love game much someone wants collect everythin...	True	1729275153	326	16	8977	3957	8988
1	elden_ring	roll	True	1729275009	11	2	2422	1085	2513
2	elden_ring	laterally perfect every way bad thing optimiza...	True	1729270437	0	1	350	492	492
3	elden_ring	try finger hole	True	1729269896	0	1	6456	80	6536
4	elden_ring	damn damn explain piece art pure phenomenal	True	1729268993	51	3	5563	1424	5563

	game_name	review	voted_up	timestamp_created	author_num_games_owned	author_num_reviews	author_playtime_at_review	author_playtime_last_two_weeks	author_playtime_forever	review_length
0	elden_ring	love game much someone wants collect everythin...	True	1729275153	326	16	8977	3957	8988	100
1	elden_ring	roll	True	1729275009	11	2	2422	1085	2513	4
2	elden_ring	laterally perfect every way bad thing optimiza...	True	1729270437	0	1	350	492	492	55
3	elden_ring	try finger hole	True	1729269896	0	1	6456	80	6536	15
4	elden_ring	damn damn explain piece art pure phenomenal	True	1729268993	51	3	5563	1424	5563	43

	game_name	review	voted_up	timestamp_created	author_num_games_owned	author_num_reviews	author_playtime_at_review	author_playtime_last_two_weeks	author_playtime_forever	review_length
0	elden_ring	love game much someone wants collect everythin...	True	1729275153	326	16	8977	3957	8988	100
1	elden_ring	roll	True	1729275009	11	2	2422	1085	2513	4
2	elden_ring	laterally perfect every way bad thing optimiza...	True	1729270437	0	1	350	492	492	55
3	elden_ring	try finger hole	True	1729269896	0	1	6456	80	6536	15
4	elden_ring	damn damn explain piece art pure phenomenal	True	1729268993	51	3	5563	1424	5563	43

	game_name	review	voted_up	timestamp_created	author_num_games_owned	author_num_reviews	author_playtime_at_review	author_playtime_last_two_weeks	author_playtime_forever	review_length	...	experience_level_experienced	experience_level_intermediate
0	elden_ring	love game much someone wants collect everythin...	1	2024-10-18 18:12:33	326	16	8977	3957	8988	100	...	1	0
1	elden_ring	roll	1	2024-10-18 18:10:09	11	2	2422	1085	2513	4	...	0	0
2	elden_ring	laterally perfect every way bad thing optimiza...	1	2024-10-18 16:53:57	0	1	350	492	492	55	...	0	0
3	elden_ring	try finger hole	1	2024-10-18 16:44:56	0	1	6456	80	6536	15	...	0	1
4	elden_ring	damn damn explain piece art pure phenomenal	1	2024-10-18 16:29:53	51	3	5563	1424	5563	43	...	1	0